import numpy as np
import pandas as pd
def grouped_sum(array, groups, axis=0, issorted=False):
    """Sum the entries of ``array`` that share a label in ``groups``.

    Parameters
    ----------
    array : array_like
        Values to aggregate.
    groups : array_like, 1-D
        One label per position along ``axis`` of ``array``.
    axis : int, optional
        Axis of ``array`` that ``groups`` labels. Defaults to 0.
    issorted : bool, optional
        Pass True when equal labels in ``groups`` are already contiguous
        (e.g. ``groups`` is sorted); the argsort and the reordering of
        ``array`` are then skipped. Each run of equal labels is summed
        separately, so unsorted input with this flag yields one entry per run.

    Returns
    -------
    uniques : ndarray
        The label of each group, in ascending order when ``groups`` had to be
        sorted (or was passed in sorted).
    result : ndarray
        Group sums. Same shape as ``array`` except that ``axis`` has one entry
        per element of ``uniques``.

    Raises
    ------
    ValueError
        If ``groups`` is not one-dimensional or its length differs from the
        length of ``array`` along ``axis``.
    """
    array = np.asarray(array)
    groups = np.asarray(groups)

    if groups.ndim != 1:
        raise ValueError("groups must be one-dimensional")
    if array.shape[axis] != groups.shape[0]:
        raise ValueError(
            "groups must have one label per entry of array along axis "
            "(got %d labels for %d entries)"
            % (groups.shape[0], array.shape[axis])
        )

    if groups.size == 0:
        # No labels means no groups. Accumulating over the zero-length axis
        # returns an empty array of the matching shape without touching the
        # boundary logic below, which needs at least one element.
        return groups, np.add.accumulate(array, axis=axis)

    if issorted:
        sorted_groups = groups
        ordered_array = array
    else:
        perm = groups.argsort()
        sorted_groups = groups[perm]
        # Reorder along the labelled axis (not always axis 0).
        ordered_array = np.take(array, perm, axis=axis)

    # True at the first element of every run of equal labels.
    is_start = np.concatenate(
        ([True], sorted_groups[1:] != sorted_groups[:-1])
    )
    uniques = sorted_groups[is_start]
    starts, = is_start.nonzero()

    # reduceat sums each slice [starts[i]:starts[i + 1]] along ``axis``; the
    # last slice runs to the end of the axis.
    result = np.add.reduceat(ordered_array, starts, axis=axis)
    return uniques, result
# Benchmark data: 10**7 standard-normal values (seed 0) and 10**7 integer
# group labels drawn from 0..9 (seed 2).
x = np.random.RandomState(0).randn(int(1e7))
y = np.random.RandomState(2).randint(10, size=int(1e7))
df = pd.DataFrame({'x': x, 'y': y})
# The same values paired with the labels in sorted order, used by the
# sorted-input comparisons further down.
y_sorted = np.sort(y)
df_sorted = pd.DataFrame({'x': x, 'y': y_sorted})
# Reference result: per-label sum of x computed by pandas.
df.groupby('y').x.sum()
# Out:
# y
# 0     325.395301
# 1     116.311628
# 2    -360.622610
# 3     342.183991
# 4    -200.706264
# 5     964.375425
# 6     170.656262
# 7    1651.389744
# 8    -715.373282
# 9     734.414114
# Name: x, dtype: float64
# The same per-label sums computed with the NumPy-only grouped_sum.
grouped_sum(x, y)
# Out:
# (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
#  array([ 325.39530127,  116.31162771, -360.62260997,  342.18399102,
#         -200.70626376,  964.37542492,  170.65626202, 1651.38974376,
#         -715.37328207,  734.41411426]))
# Check that pandas and grouped_sum agree on unsorted labels. grouped_sum
# returns (labels, sums); reversing the tuple passes the sums as the Series
# data and the labels as its index.
s1 = df.groupby('y').x.sum()
s2 = pd.Series(*grouped_sum(x, y)[::-1])
assert abs(s1 - s2).mean() < 1e-10
# Time both approaches on unsorted labels (IPython %timeit magic).
%timeit df.groupby('y').x.sum()
%timeit grouped_sum(x, y)
# Out (one line per %timeit call above, in order):
# 10 loops, best of 3: 189 ms per loop
# 1 loops, best of 3: 1.04 s per loop
# Repeat the agreement check on sorted labels, calling grouped_sum both
# without and with the issorted=True flag.
s1 = df_sorted.groupby('y').x.sum()
s2 = pd.Series(*grouped_sum(x, y_sorted)[::-1])
s3 = pd.Series(*grouped_sum(x, y_sorted, issorted=True)[::-1])
assert abs(s1 - s2).mean() < 1e-10
assert abs(s1 - s3).mean() < 1e-10
# Time pandas, grouped_sum with its default arguments, and grouped_sum with
# issorted=True on the sorted labels (IPython %timeit magic).
%timeit df_sorted.groupby('y').x.sum()
%timeit grouped_sum(x, y_sorted)
%timeit grouped_sum(x, y_sorted, issorted=True)
# Out (one line per %timeit call above, in order):
# 10 loops, best of 3: 191 ms per loop
# 1 loops, best of 3: 370 ms per loop
# 10 loops, best of 3: 28.2 ms per loop